## Time Series:
## Start = c(1, 4)
## End = c(4, 2)
## Frequency = 7
## Demand WorkDay Temperature
## 1.428571 174.8963 0 26.0
## 1.571429 188.5909 1 23.0
## 1.714286 188.9169 1 22.2
## 1.857143 173.8142 0 20.3
## 2.000000 169.5152 0 26.1
## 2.142857 195.7288 1 19.6
## 2.285714 199.9029 1 20.0
## 2.428571 205.3375 1 27.4
## 2.571429 228.0782 1 32.4
## 2.714286 258.5984 1 34.0
## 2.857143 201.7970 0 22.4
## 3.000000 187.6298 0 22.5
## 3.142857 254.6636 1 30.0
## 3.285714 322.2323 1 42.4
## 3.428571 343.9934 1 41.5
## 3.571429 347.6376 1 43.2
## 3.714286 332.9455 1 43.1
## 3.857143 219.7517 0 23.7
## 4.000000 186.9816 0 22.3
## 4.142857 228.4876 1 24.0
##
## Call:
## tslm(formula = Demand ~ Temperature, data = daily20)
##
## Coefficients:
## (Intercept) Temperature
## 39.212 6.757
# Residual diagnostics for the temperature-only regression.
# Pass the fitted model itself (not tslm_Dem_Temp$residuals): checkresiduals()
# needs the model object to recover the model degrees of freedom and run a
# Breusch-Godfrey test. Passing only the residuals made modeldf.default()
# warn: "Could not find appropriate degrees of freedom for this model."
checkresiduals(tslm_Dem_Temp)
# I think that this model is adequate because residuals aren't correlated with each other. But there was an outlier.
# Forecast demand at two new temperatures, 15 and 35 degrees, from the
# temperature-only regression. 35 sits inside the observed temperature range
# (19.6-43.2 in the 20-day sample) while 15 is below it, so the second
# forecast is better supported by the data than the first.
fc_Dem_Temp <- forecast(tslm_Dem_Temp,
newdata=data.frame(Temperature=c(15,35)))
fc_Dem_Temp
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 4.285714 140.5701 108.6810 172.4591 90.21166 190.9285
## 4.428571 275.7146 245.2278 306.2014 227.57056 323.8586
# I think that the model forecasted rightly because the forecasted temperature values were near to the range of temperatures in the data
# 80% intervals
fc_Dem_Temp$upper[, 1]
## Time Series:
## Start = c(4, 3)
## End = c(4, 4)
## Frequency = 7
## [1] 172.4591 306.2014
fc_Dem_Temp$lower[, 1]
## Time Series:
## Start = c(4, 3)
## End = c(4, 4)
## Frequency = 7
## [1] 108.6810 245.2278
# 95% intervals
fc_Dem_Temp$upper[, 2]
## Time Series:
## Start = c(4, 3)
## End = c(4, 4)
## Frequency = 7
## [1] 190.9285 323.8586
fc_Dem_Temp$lower[, 2]
## Time Series:
## Start = c(4, 3)
## End = c(4, 4)
## Frequency = 7
## [1] 90.21166 227.57056
# Scatter plot of daily electricity demand against temperature over the FULL
# elecdaily series, with a least-squares line overlaid, to judge whether the
# 20-day linear model generalizes to the whole year.
elecdaily %>%
  as.data.frame() %>%
  ggplot(aes(x = Temperature, y = Demand)) +
  labs(y = "Electricity Demand", x = "Temperature") +
  geom_point() +
  geom_smooth(method="lm", se=FALSE)
## `geom_smooth()` using formula 'y ~ x'
# The result plot indicates that the model was made by few data points. It could've explained the data of the first 20 days well, but it wasn't right model for total data points
# a. Plot the winning time against the year. Describe the main features of the plot.
autoplot(mens400)
# Feature1. Winning times in Olympic men's 400m track final had the trend of decreasing as time went on.
# Feature2. There are missing values.
# Extract time part from mens400 time series to do linear modeling.
# time() gives the decimal year of each observation; regressing the winning
# time on it fits a straight trend line. tslm()'s default na.action handles
# the missing (wartime) Games.
time_mens400 <- time(mens400)
tslm_mens400 <- tslm(mens400 ~ time_mens400,
data = mens400)
# Show data with regression line
# geom_abline() draws the fitted line directly from the model's intercept
# (coefficients[1]) and slope (coefficients[2]) on top of the series plot.
autoplot(mens400) +
geom_abline(slope = tslm_mens400$coefficients[2],
intercept = tslm_mens400$coefficients[1],
colour = "red")
# Get the winning time decreasing rate
# (the slope: average change in winning time per year, in seconds)
tslm_mens400$coefficients[2]
## time_mens400
## -0.06457385
# The winning times have been decreasing at average rate of 0.06457 second per year.
# Plot the regression residuals against time to look for remaining structure
# (curvature or autocorrelation would suggest the linear trend is inadequate).
cbind(Time = time_mens400,
Residuals = tslm_mens400$residuals) %>%
as.data.frame() %>%
ggplot(aes(x = Time, y = Residuals)) +
geom_point() +
ylab("Residuals of Regression Line(Unit:s)")
## Warning: Removed 3 rows containing missing values (geom_point).
# The residual plot shows that the regression model generally fitted the data well. I can check it using checkresiduals function, too.
checkresiduals(tslm_mens400)
##
## Breusch-Godfrey test for serial correlation of order up to 6
##
## data: Residuals from Linear regression model
## LM test = 3.6082, df = 6, p-value = 0.7295
# I made linear model with na.action argument as na.exclude to exclude missing values.
# And then I used the linear model in forecast function to get prediction interval.
# Forecast function can't calculate prediction interval when there is any missing values in the data that I excluded them fitting linear model.
# Refit with lm() and na.action = na.exclude so that forecast() can compute
# prediction intervals despite the missing observations in mens400.
lm_mens400 <- lm(
mens400 ~ time_mens400,
data = mens400,
na.action = na.exclude
)
# Forecast the 2020 winning time by supplying the new predictor value.
fc_mens400 <- forecast(
lm_mens400,
newdata = data.frame(time_mens400 = 2020)
)
# Plot the series with the 2020 point forecast and its prediction intervals.
autoplot(mens400) +
autolayer(fc_mens400, PI = TRUE)
# Get 80% and 95% prediction intervals
# Columns of $upper/$lower are the 80% and 95% bounds respectively.
fc_mens400$upper
## [,1] [,2]
## [1,] 43.63487 44.53176
fc_mens400$lower
## [,1] [,2]
## [1,] 40.44975 39.55286
# 80% interval is from 40.45 to 43.63
# 95% interval is from 39.55 to 44.53
# Caveat: these intervals assume normally distributed residuals. The
# Breusch-Godfrey test above only addresses autocorrelation (p = 0.73, so no
# significant serial correlation); normality should be judged from the
# residual histogram in the checkresiduals() output, which looked
# questionable here — NOTE(review): confirm against the plotted histogram.
## Qtr1 Qtr2 Qtr3 Qtr4
## 1956 284 213 227 308
## 1957 262 228
## Time-Series [1:218] from 1956 to 2010: 284 213 227 308 262 228 236 320 272 233 ...
## [1] 1956.00 2010.25
## Qtr1 Qtr2 Qtr3 Qtr4
## 1956 0.67 0.33 0.00 0.00
## 1957 0.00 1.00 0.00 0.00
## 1958 0.00 1.00 0.00 0.00
## 1959 1.00 0.00 0.00 0.00
## 1960 0.00 1.00 0.00 0.00
## 1961 0.33 0.67 0.00 0.00
## 1962 0.00 1.00 0.00 0.00
## 1963 0.00 1.00 0.00 0.00
## 1964 1.00 0.00 0.00 0.00
## 1965 0.00 1.00 0.00 0.00
## 1966 0.00 1.00 0.00 0.00
## 1967 1.00 0.00 0.00 0.00
## 1968 0.00 1.00 0.00 0.00
## 1969 0.00 1.00 0.00 0.00
## 1970 1.00 0.00 0.00 0.00
## 1971 0.00 1.00 0.00 0.00
## 1972 0.33 0.67 0.00 0.00
## 1973 0.00 1.00 0.00 0.00
## 1974 0.00 1.00 0.00 0.00
## 1975 1.00 0.00 0.00 0.00
## 1976 0.00 1.00 0.00 0.00
## 1977 0.00 1.00 0.00 0.00
## 1978 1.00 0.00 0.00 0.00
## 1979 0.00 1.00 0.00 0.00
## 1980 0.00 1.00 0.00 0.00
## 1981 0.00 1.00 0.00 0.00
## 1982 0.00 1.00 0.00 0.00
## 1983 0.00 1.00 0.00 0.00
## 1984 0.00 1.00 0.00 0.00
## 1985 0.00 1.00 0.00 0.00
## 1986 1.00 0.00 0.00 0.00
## 1987 0.00 1.00 0.00 0.00
## 1988 0.00 1.00 0.00 0.00
## 1989 1.00 0.00 0.00 0.00
## 1990 0.00 1.00 0.00 0.00
## 1991 1.00 0.00 0.00 0.00
## 1992 0.00 1.00 0.00 0.00
## 1993 0.00 1.00 0.00 0.00
## 1994 0.00 1.00 0.00 0.00
## 1995 0.00 1.00 0.00 0.00
## 1996 0.00 1.00 0.00 0.00
## 1997 1.00 0.00 0.00 0.00
## 1998 0.00 1.00 0.00 0.00
## 1999 0.00 1.00 0.00 0.00
## 2000 0.00 1.00 0.00 0.00
## 2001 0.00 1.00 0.00 0.00
## 2002 1.00 0.00 0.00 0.00
## 2003 0.00 1.00 0.00 0.00
## 2004 0.00 1.00 0.00 0.00
## 2005 1.00 0.00 0.00 0.00
## 2006 0.00 1.00 0.00 0.00
## 2007 0.00 1.00 0.00 0.00
## 2008 1.00 0.00 0.00 0.00
## 2009 0.00 1.00 0.00 0.00
## 2010 0.00 1.00
An elasticity coefficient is the ratio of the percentage change in the forecast variable \((y)\) to the percentage change in the predictor variable \((x)\). Mathematically, the elasticity is defined as \((d y / d x) \times(x / y)\). Consider the log-log model, \[ \log y=\beta_{0}+\beta_{1} \log x+\varepsilon \] Express \(y\) as a function of \(x\) and show that the coefficient \(\beta_{1}\) is the elasticity coefficient.
# Plot monthly souvenir-shop sales and print the first few years of raw
# values to inspect the trend and the seasonal pattern.
autoplot(fancy)
head(fancy, 50)
## Jan Feb Mar Apr May Jun Jul Aug
## 1987 1664.81 2397.53 2840.71 3547.29 3752.96 3714.74 4349.61 3566.34
## 1988 2499.81 5198.24 7225.14 4806.03 5900.88 4951.34 6179.12 4752.15
## 1989 4717.02 5702.63 9957.58 5304.78 6492.43 6630.80 7349.62 8176.62
## 1990 5921.10 5814.58 12421.25 6369.77 7609.12 7224.75 8121.22 7979.25
## 1991 4826.64 6470.23
## Sep Oct Nov Dec
## 1987 5021.82 6423.48 7600.60 19756.21
## 1988 5496.43 5835.10 12600.08 28541.72
## 1989 8573.17 9690.50 15151.84 34061.01
## 1990 8093.06 8476.70 17914.66 30114.41
## 1991
# Sales generally increased from January to December. Sales increased dramatically in Decembers. Sales in Decembers increased as time went on, but in 1991, sales decreased. In most years, there was also unexpected increase every March, but the increases were a lot smaller than the increases in Decembers.
# The size of the seasonal variations should be almost same across the whole series to be fitted well to a model. Fancy data shows that seasonal variations increased exponentially. Therefore it is necessary to take logarithms of the data.
# Make the "surfing_festival" dummy variable from the series' own time index:
# 1 for every March from 1988 onwards (when the festival started), 0
# otherwise. cycle() returns each observation's month number (1-12), which
# replaces the manual round(12 * (t - floor(t))) + 1 computation, and the
# vectorized comparison replaces the original element-by-element for loop
# that grew the vector one entry at a time.
Time <- time(fancy)
surfing_festival <- as.numeric(cycle(fancy) == 3 & floor(Time) >= 1988)
# Fit a log-linear regression: BoxCox with lambda = 0 is the log transform.
# Regressors are a linear trend, monthly seasonal dummies, and the
# surfing-festival dummy.
tslm_log_fancy <- tslm(
BoxCox(fancy, 0) ~ trend + season + surfing_festival
)
# Plot the residuals against time to look for leftover structure.
autoplot(tslm_log_fancy$residuals)
# The residuals have pattern against time. It means that there is correlation between residuals and time.
# Scatter the residuals against the fitted values to check for
# heteroscedasticity remaining after the log transformation.
cbind(Residuals = tslm_log_fancy$residuals,
Fitted_values = tslm_log_fancy$fitted.values) %>%
as.data.frame() %>%
ggplot(aes(x = Fitted_values,
y = Residuals)) +
geom_point()
# The size of the residuals changed as we move along the x-axis(fitted values). It means that even after log transformation, there are still heteroscedacity in the errors or that the variance of the residuals aren't still constant.
# Boxplots of the residuals by calendar month.
# Bug fix: the original used factor(..., labels = month.abb), which relabels
# the DEFAULT (alphabetically sorted) levels — Apr, Aug, Dec, Feb, ... — as
# Jan, Feb, Mar, ..., scrambling which residuals belong to which month.
# Using levels = month.abb keeps the months in calendar order with their
# true identities.
cbind.data.frame(
  Month = factor(
    month.abb[round(12*(Time - floor(Time)) + 1)],
    levels = month.abb,
    ordered = TRUE
  ),
  Residuals = tslm_log_fancy$residuals
) %>%
  ggplot(aes(x = Month,
             y = Residuals)) +
  geom_boxplot()
## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.
# If vectors are combined by cbind function, the class of the result is matrix, which should hold one type of data. If I want to make the columns to have different data types, I need to use cbind.data.frame function instead. Instead, if I still want to use cbind function, I need to use as.numeric function in mapping of ggplot.
# If the mapping of boxplot is (factor x factor), it would be difficult to see any box because boxplot function can't aggregate factor type data. The result would be looked like a scatter plot.
# To see the change of the residuals for each month, I used ggsubseriesplot function.
ggsubseriesplot(tslm_log_fancy$residuals)
# The distribution of the residuals was unsymetrical for some months. And for some months, the median of the residuals wasn't 0(residuals' mean should be 0 for all months because getting the minimum SSE means getting mean). Residuals with such properties can't have normal distribution, which will make it difficult to get prediction interval.
# Print the fitted coefficients: intercept, trend, the 11 seasonal dummies
# (relative to January) and the surfing-festival dummy.
tslm_log_fancy$coefficients
## (Intercept) trend season2 season3
## 7.61966701 0.02201983 0.25141682 0.26608280
## season4 season5 season6 season7
## 0.38405351 0.40948697 0.44882828 0.61045453
## season8 season9 season10 season11
## 0.58796443 0.66932985 0.74739195 1.20674790
## season12 surfing_festival
## 1.96224123 0.50151509
# The model has positive trend. It means that as time goes on, the sales amount generally increases.
# And all seasonal variables are positive. It means that the sales amount was minimum on January and the sales of the other months were greater than January for most of years.
# Finally, surfing_festival variable's coefficient is 0.501 and it isn't small compared to the others. It means that there were increased sales in Marchs when surfing festival happened.
checkresiduals(tslm_log_fancy)
##
## Breusch-Godfrey test for serial correlation of order up to 17
##
## data: Residuals from Linear regression model
## LM test = 37.954, df = 17, p-value = 0.002494
# The p value of the test is less than 0.05. It means that the residuals can be distinguished from white noise. The residuals can be correlated with each other.
# h. Regardless of your answers to the above questions, use your regression model to predict the monthly sales for 1994, 1995, and 1996. Produce prediction intervals for each of your forecasts.
# make surfing festival data for the months of 1994 through 1996.
# Surfing-festival dummy for Jan 1994 - Dec 1996: 1 in each March (position
# 3, 15, 27 of the 36 months), 0 otherwise. The vectorized comparison
# replaces the original element-wise for loop over i %% 12 == 3.
future_fancy <- as.numeric(seq_len(36) %% 12 == 3)
# make future data as time series.
future_fancy <- ts(data = future_fancy,
start = 1994,
end = c(1996, 12),
frequency = 12)
# forecast
# Note: the model's regressors are trend, season and surfing_festival;
# trend and season are generated automatically by forecast(), so only the
# surfing_festival column of newdata is used (the Time column is ignored).
fc_tslm_log_fancy <- forecast(
tslm_log_fancy,
newdata = data.frame(Time = time(future_fancy),
surfing_festival = future_fancy)
)
# plot the forecast
autoplot(fc_tslm_log_fancy)
# show prediction interval (still on the log scale; columns are the 80%
# and 95% upper bounds)
fc_tslm_log_fancy$upper
## [,1] [,2]
## Jan 1994 9.744183 9.88111
## Feb 1994 10.017620 10.15455
## Mar 1994 10.557120 10.69475
## Apr 1994 10.194296 10.33122
## May 1994 10.241749 10.37868
## Jun 1994 10.303110 10.44004
## Jul 1994 10.486756 10.62368
## Aug 1994 10.486286 10.62321
## Sep 1994 10.589671 10.72660
## Oct 1994 10.689753 10.82668
## Nov 1994 11.171129 11.30806
## Dec 1994 11.948642 12.08557
## Jan 1995 10.011336 10.14984
## Feb 1995 10.284773 10.42328
## Mar 1995 10.823938 10.96297
## Apr 1995 10.461449 10.59996
## May 1995 10.508903 10.64741
## Jun 1995 10.570264 10.70877
## Jul 1995 10.753910 10.89242
## Aug 1995 10.753440 10.89195
## Sep 1995 10.856825 10.99533
## Oct 1995 10.956907 11.09541
## Nov 1995 11.438282 11.57679
## Dec 1995 12.215796 12.35430
## Jan 1996 10.279093 10.41951
## Feb 1996 10.552530 10.69294
## Mar 1996 11.091365 11.23212
## Apr 1996 10.729206 10.86962
## May 1996 10.776659 10.91707
## Jun 1996 10.838021 10.97843
## Jul 1996 11.021667 11.16208
## Aug 1996 11.021196 11.16161
## Sep 1996 11.124582 11.26499
## Oct 1996 11.224664 11.36508
## Nov 1996 11.706039 11.84645
## Dec 1996 12.483552 12.62396
fc_tslm_log_fancy$lower
## [,1] [,2]
## Jan 1994 9.238522 9.101594
## Feb 1994 9.511959 9.375031
## Mar 1994 10.048860 9.911228
## Apr 1994 9.688635 9.551707
## May 1994 9.736088 9.599161
## Jun 1994 9.797449 9.660522
## Jul 1994 9.981095 9.844168
## Aug 1994 9.980625 9.843698
## Sep 1994 10.084010 9.947083
## Oct 1994 10.184092 10.047165
## Nov 1994 10.665468 10.528541
## Dec 1994 11.442981 11.306054
## Jan 1995 9.499844 9.361338
## Feb 1995 9.773281 9.634775
## Mar 1995 10.310518 10.171489
## Apr 1995 9.949957 9.811451
## May 1995 9.997411 9.858904
## Jun 1995 10.058772 9.920265
## Jul 1995 10.242418 10.103911
## Aug 1995 10.241948 10.103441
## Sep 1995 10.345333 10.206826
## Oct 1995 10.445415 10.306908
## Nov 1995 10.926791 10.788284
## Dec 1995 11.704304 11.565797
## Jan 1996 9.760564 9.620151
## Feb 1996 10.034000 9.893588
## Mar 1996 10.571566 10.430810
## Apr 1996 10.210677 10.070264
## May 1996 10.258130 10.117718
## Jun 1996 10.319491 10.179079
## Jul 1996 10.503137 10.362725
## Aug 1996 10.502667 10.362254
## Sep 1996 10.606052 10.465640
## Oct 1996 10.706134 10.565722
## Nov 1996 11.187510 11.047097
## Dec 1996 11.965023 11.824611
# The intervals on Decembers were especially large.
# make fc_tslm_fancy object, which are inverse log transformed version of fc_tslm_log_fancy.
# Copy the forecast object, then back-transform each numeric component with
# InvBoxCox(lambda = 0), i.e. exp(). This mutates the object's internals
# directly rather than refitting.
fc_tslm_fancy <- fc_tslm_log_fancy
# members which should be inverse log transformed.
members_inv.log <- c('x', 'mean', 'lower', 'upper', 'residuals', 'fitted')
# apply inverse log transform to the members.
# lapply over the selected list members keeps each component's structure
# (ts attributes, interval matrices) while exponentiating the values.
fc_tslm_fancy[members_inv.log] <- lapply(
fc_tslm_log_fancy[members_inv.log],
InvBoxCox,
lambda = 0
)
# apply inverse log transform to 'BoxCox(fancy, 0)' member in model's model.
# model$model[[1]] is the response column of the model frame; single-bracket
# indexing on the LHS preserves the data.frame column structure.
fc_tslm_fancy[['model']][['model']][1] <- lapply(
fc_tslm_log_fancy[['model']][['model']][1],
InvBoxCox,
lambda = 0
)
autoplot(fc_tslm_fancy)
# Even if I transformed the data, fitted values and forecasts, the name of predicted values is still 'BoxCox(fancy, 0)'.
# I can't change it because it came from the formula in tslm function. Changing it means making new model, not just changing the variable's name.
# I think that it is better to set lambda = 0 in forecast function from the very first to forecast using log transformation.
fc_tslm_fancy$upper
## [,1] [,2]
## Jan 1994 17054.73 19557.43
## Feb 1994 22418.00 25707.73
## Mar 1994 38450.24 44123.68
## Apr 1994 26750.16 30675.62
## May 1994 28050.15 32166.37
## Jun 1994 29825.24 34201.95
## Jul 1994 35837.72 41096.73
## Aug 1994 35820.87 41077.41
## Sep 1994 39722.43 45551.50
## Oct 1994 43903.67 50346.32
## Nov 1994 71049.28 81475.41
## Dec 1994 154607.08 177294.90
## Jan 1995 22277.59 25587.08
## Feb 1995 29283.31 33633.55
## Mar 1995 50208.44 57697.39
## Apr 1995 34942.16 40133.06
## May 1995 36640.25 42083.42
## Jun 1995 38958.95 44746.58
## Jul 1995 46812.70 53767.06
## Aug 1995 46790.69 53741.78
## Sep 1995 51887.06 59595.26
## Oct 1995 57348.77 65868.34
## Nov 1995 92807.48 106594.69
## Dec 1995 201954.08 231955.81
## Jan 1996 29117.46 33506.86
## Feb 1996 38274.14 44043.89
## Mar 1996 65602.25 75517.62
## Apr 1996 45670.42 52555.15
## May 1996 47889.88 55109.18
## Jun 1996 50920.48 58596.65
## Jul 1996 61185.57 70409.18
## Aug 1996 61156.80 70376.07
## Sep 1996 67817.91 78041.33
## Oct 1996 74956.52 86256.08
## Nov 1996 121302.09 139588.15
## Dec 1996 263959.89 303751.35
fc_tslm_fancy$lower
## [,1] [,2]
## Jan 1994 10285.82 8969.583
## Feb 1994 13520.45 11790.284
## Mar 1994 23129.40 20155.412
## Apr 1994 16133.21 14068.696
## May 1994 16917.24 14752.395
## Jun 1994 17987.81 15685.969
## Jul 1994 21613.98 18848.111
## Aug 1994 21603.82 18839.249
## Sep 1994 23956.87 20891.193
## Oct 1994 26478.61 23090.230
## Nov 1994 42850.31 37366.903
## Dec 1994 93244.59 81312.400
## Jan 1995 13357.65 11629.938
## Feb 1995 17558.28 15287.252
## Mar 1995 30046.98 26146.972
## Apr 1995 20951.33 18241.435
## May 1995 21969.51 19127.918
## Jun 1995 23359.80 20338.387
## Jul 1995 28068.91 24438.412
## Aug 1995 28055.72 24426.922
## Sep 1995 31111.50 27087.467
## Oct 1995 34386.35 29938.733
## Nov 1995 55647.40 48449.831
## Dec 1995 121091.75 105429.448
## Jan 1996 17336.40 15065.329
## Feb 1996 22788.25 19802.984
## Mar 1996 39009.73 33887.802
## Apr 1996 27191.96 23629.808
## May 1996 28513.41 24778.151
## Jun 1996 30317.82 26346.183
## Jul 1996 36429.60 31657.322
## Aug 1996 36412.48 31642.439
## Sep 1996 40378.47 35088.887
## Oct 1996 44628.77 38782.394
## Nov 1996 72222.70 62761.521
## Dec 1996 157160.50 136572.460
# The range of prediction intervals became a lot bigger after inverse log transformation.
# The predictions when I don't use log transformation.
# Same regression without the log transformation, for comparison.
tslm_fancy <- tslm(
fancy ~ trend + season + surfing_festival
)
# Forecast 1994-1996 with the untransformed model (the Time column in
# newdata is ignored; only surfing_festival is a model regressor).
fc_tslm_fancy2 <- forecast(
tslm_fancy,
newdata = data.frame(Time = time(future_fancy),
surfing_festival = future_fancy)
)
autoplot(fc_tslm_fancy2)
# The result shows that the forecasts couldn't reflect the exponential growth trend.
# I could've improved the predictions by using log transformation. By using the transformation, the predictions could reflect the sales' exponential growth trend better.
# Inspect the US finished motor gasoline supply series: weekly data with a
# non-integer frequency of about 52.18 observations per year.
str(gasoline)
## Time-Series [1:1355] from 1991 to 2017: 6.62 6.43 6.58 7.22 6.88 ...
head(gasoline)
## Time Series:
## Start = 1991.1
## End = 1991.19582477755
## Frequency = 52.1785714285714
## [1] 6.621 6.433 6.582 7.224 6.875 6.947
# They are weekly data and it would be useful to make model with harmonic regression.
# extract gasoline data up to the end of 2004 and plot it
gasoline_until_2004 <- window(gasoline, end = 2005)
autoplot(gasoline_until_2004, xlab = "Year")
# make tslm model
# Fit harmonic regressions (trend + K Fourier pairs) for several K values.
# Each model is stored in the global environment under a constructed name
# via assign(); later code retrieves them by name with get().
for(num in c(1, 2, 3, 5, 10, 20)){
#make variable names for each model using pair number.
var_name <- paste("tslm_ft",
as.character(num),
"_gasoline_until_2004",
sep = "")
#assign ts linear model to each variable name.
assign(var_name,
tslm(gasoline_until_2004 ~ trend + fourier(
gasoline_until_2004,
K = num
))
)
#plot the data and fitted values
# print() is required: ggplot objects are not auto-printed inside a loop.
print(
autoplot(gasoline_until_2004) +
autolayer(get(var_name)$fitted.values,
series = as.character(num)) +
ggtitle(var_name) +
ylab("gasoline") +
guides(colour = guide_legend(title = "Number of Fourier Transform pairs"))
)
}
# Overlay the fitted values of all six harmonic models on the data.
# Bug fix: the original mislabeled the series — tslm_ft5 was labeled "2",
# tslm_ft10 was plotted twice (labeled "3" and "5"), tslm_ft20 twice
# (labeled "10" and "20"), and the K = 2 and K = 3 models were never shown.
# Each model is now paired with its own K label.
autoplot(gasoline_until_2004) +
autolayer(tslm_ft1_gasoline_until_2004$fitted.values, series = "1") +
autolayer(tslm_ft2_gasoline_until_2004$fitted.values, series = "2") +
autolayer(tslm_ft3_gasoline_until_2004$fitted.values, series = "3") +
autolayer(tslm_ft5_gasoline_until_2004$fitted.values, series = "5") +
autolayer(tslm_ft10_gasoline_until_2004$fitted.values, series = "10") +
autolayer(tslm_ft20_gasoline_until_2004$fitted.values, series = "20") +
guides(colour = guide_legend(title = "Fourier Transform pairs")) +
scale_color_discrete(breaks = c(1, 2, 3, 5, 10, 20))
# as more number of Fourier pairs used, the fitted line looks more like the original data. But the fitted lines didn't follow the trend well.
# Print CV, AIC, AICc, BIC and adjusted R^2 for each fitted harmonic model,
# looked up by its constructed global name.
for(i in c(1, 2, 3, 5, 10, 20)){
  # paste0() is the idiomatic replacement for paste(..., sep = "") and
  # coerces the numeric i to character itself.
  tslm_ft_gasoline_until_2004.name <- paste0(
    "tslm_ft", i, "_gasoline_until_2004"
  )
  # Default sep = " " here is intentional: it reproduces the original
  # "\n name \n" banner exactly.
  writeLines(
    paste(
      "\n", tslm_ft_gasoline_until_2004.name, "\n"
    )
  )
  print(CV(get(tslm_ft_gasoline_until_2004.name)))
}
##
## tslm_ft1_gasoline_until_2004
##
## CV AIC AICc BIC AdjR2
## 8.201921e-02 -1.813292e+03 -1.813208e+03 -1.790354e+03 8.250858e-01
##
## tslm_ft2_gasoline_until_2004
##
## CV AIC AICc BIC AdjR2
## 8.136854e-02 -1.819113e+03 -1.818957e+03 -1.787001e+03 8.269569e-01
##
## tslm_ft3_gasoline_until_2004
##
## CV AIC AICc BIC AdjR2
## 7.658846e-02 -1.863085e+03 -1.862834e+03 -1.821797e+03 8.375702e-01
##
## tslm_ft5_gasoline_until_2004
##
## CV AIC AICc BIC AdjR2
## 7.553646e-02 -1.873234e+03 -1.872723e+03 -1.813596e+03 8.406928e-01
##
## tslm_ft10_gasoline_until_2004
##
## CV AIC AICc BIC AdjR2
## 7.135754e-02 -1.915014e+03 -1.913441e+03 -1.809500e+03 8.516102e-01
##
## tslm_ft20_gasoline_until_2004
##
## CV AIC AICc BIC AdjR2
## 7.223834e-02 -1.908017e+03 -1.902469e+03 -1.710753e+03 8.540588e-01
# In the above 6 K values, 10 minimized the AICc and CV value.
# Get exact number of Fourier pairs to minimize AICc or CV
# Search for the number of Fourier pairs K that minimizes AICc and CV.
min_AICc <- Inf
min_K_by_AICc <- 0
min_CV <- Inf
min_K_by_CV <- 0
AICc_K <- 0
CV_K <- 0
# maximum number of pairs is 26 because the frequency of gasoline data is about 52.18
for(num in 1:26){
  # Fit the harmonic regression ONCE per K and compute its CV statistics
  # once; the original refit the identical model a second time just to read
  # CV from it, doubling the (expensive) work for the same numbers.
  CV_stats <- CV(
    tslm(
      gasoline_until_2004 ~ trend + fourier(gasoline_until_2004, K = num)
    )
  )
  AICc_K <- CV_stats[["AICc"]]
  print(AICc_K)
  CV_K <- CV_stats[["CV"]]
  print(CV_K)
  # Early exit: once NEITHER criterion improves on its stored minimum,
  # report the best K values found and stop. This heuristic assumes both
  # criteria are roughly unimodal in K.
  # Skip the check at num = 1, where no previous minima exist to compare to.
  if(num != 1){
    if(AICc_K >= min_AICc & CV_K >= min_CV){
      writeLines(
        paste("The number of Fourier Transform pairs to minimize AICc",
              "\n",
              as.character(min_K_by_AICc)
        )
      )
      writeLines(
        paste("The number of Fourier Transform pairs to minimize CV",
              "\n",
              as.character(min_K_by_CV)
        )
      )
      break
    }
  }
  # Track the running minima and the K values that achieved them.
  if(AICc_K < min_AICc){
    min_AICc <- AICc_K
    min_K_by_AICc <- num
  }
  if(CV_K < min_CV){
    min_CV <- CV_K
    min_K_by_CV <- num
  }
}
## [1] -1813.208
## [1] 0.08201921
## [1] -1818.957
## [1] 0.08136854
## [1] -1862.834
## [1] 0.07658846
## [1] -1863.742
## [1] 0.07648608
## [1] -1872.723
## [1] 0.07553646
## [1] -1902.077
## [1] 0.0725333
## [1] -1912.672
## [1] 0.0714734
## [1] -1912.542
## [1] 0.07147431
## The number of Fourier Transform pairs to minimize AICc
## 7
## The number of Fourier Transform pairs to minimize CV
## 7
# To get minimum AICc or CV, I need 7 pairs.
# Refit the chosen model (K = 7 Fourier pairs, selected by AICc/CV above)
# and run residual diagnostics on it.
tslm_ft7_gasoline_until_2004 <- tslm(
gasoline_until_2004 ~ trend + fourier(
gasoline_until_2004,
K = 7
)
)
checkresiduals(tslm_ft7_gasoline_until_2004)
##
## Breusch-Godfrey test for serial correlation of order up to 104
##
## data: Residuals from Linear regression model
## LM test = 149.31, df = 104, p-value = 0.002404
# Forecast the next 52 weeks (2005). fourier(..., h = 52) generates the
# future harmonic regressors matching the K = 7 terms used in fitting.
fc_gasoline_2005 <- forecast(
tslm_ft7_gasoline_until_2004,
newdata=data.frame(fourier(
gasoline_until_2004, K = 7, h = 52)
)
)
# where tslm_ft7_gasoline_until_2004 is the fitted model using tslm, K is the number of Fourier terms used in creating fit, and h is the forecast horizon required. Got the next year's forecasts.
# Plot the 2005 forecasts against the actual 2004-2006 data, zoomed to that
# window via scale_x_continuous (which replaces autoplot's default x scale,
# hence the "Scale for 'x' is already present" message below).
autoplot(fc_gasoline_2005) +
autolayer(window(
gasoline,
start = 2004,
end = 2006
)
) +
scale_x_continuous(limits = c(2004, 2006))
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## Warning: Removed 674 row(s) containing missing values (geom_path).
# Almost all of actual data were in the 80% prediction interval. But the model couldn't predict the sudden fall in the fall. The drop was a lot bigger than expected.
## Time-Series [1:98] from 1875 to 1972: 10.38 11.86 10.97 10.8 9.79 ...
## Time Series:
## Start = 1875
## End = 1880
## Frequency = 1
## [1] 10.38 11.86 10.97 10.80 9.79 10.39
Using matrix notation it was shown that if \(\boldsymbol{y}=\boldsymbol{X} \boldsymbol{\beta}+\boldsymbol{\varepsilon}\), where \(\boldsymbol{\varepsilon}\) has mean \(\mathbf{0}\) and variance matrix \(\sigma^{2} \boldsymbol{I}\), the estimated coefficients are given by \(\hat{\boldsymbol{\beta}}=\left(\boldsymbol{X}^{\prime} \boldsymbol{X}\right)^{-1} \boldsymbol{X}^{\prime} \boldsymbol{y}\) and a forecast is given by \(\hat{y}=\boldsymbol{x}^{*} \hat{\boldsymbol{\beta}}=\boldsymbol{x}^{*}\left(\boldsymbol{X}^{\prime} \boldsymbol{X}\right)^{-1} \boldsymbol{X}^{\prime} \boldsymbol{y}\) where \(\boldsymbol{x}^{*}\) is a row vector containing the values of the regressors for the forecast (in the same format as \(\boldsymbol{X}\)), and the forecast variance is given by \(\operatorname{var}(\hat{y})=\sigma^{2}\left[1+\boldsymbol{x}^{*}\left(\boldsymbol{X}^{\prime} \boldsymbol{X}\right)^{-1}\left(\boldsymbol{x}^{*}\right)^{\prime}\right] .\) Consider the simple time trend model where \(y_{t}=\beta_{0}+\beta_{1} t\). Using the following results, \[ \sum_{t=1}^{T} t=\frac{1}{2} T(T+1), \quad \sum_{t=1}^{T} t^{2}=\frac{1}{6} T(T+1)(2 T+1) \] derive the following expressions: a. \(\boldsymbol{X}^{\prime} \boldsymbol{X}=\frac{1}{6}\left[\begin{array}{cc}6 T & 3 T(T+1) \\ 3 T(T+1) & T(T+1)(2 T+1)\end{array}\right]\) b. \(\left(\boldsymbol{X}^{\prime} \boldsymbol{X}\right)^{-1}=\frac{2}{T\left(T^{2}-1\right)}\left[\begin{array}{cc}(T+1)(2 T+1) & -3(T+1) \\ -3(T+1) & 6\end{array}\right]\) c. \(\hat{\beta}_{0}=\frac{2}{T(T-1)}\left[(2 T+1) \sum_{t=1}^{T} y_{t}-3 \sum_{t=1}^{T} t y_{t}\right]\) \[ \hat{\beta}_{1}=\frac{6}{T\left(T^{2}-1\right)}\left[2 \sum_{t=1}^{T} t y_{t}-(T+1) \sum_{t=1}^{T} y_{t}\right] \] d. \(\operatorname{Var}\left(\hat{y}_{t}\right)=\hat{\sigma}^{2}\left[1+\frac{2}{T(T-1)}\left(1-4 T-6 h+6 \frac{(T+h)^{2}}{T+1}\right)\right]\)